Registration number: 19BCE1717
Faculty: Prof. Parvathi R
Slot: L55 + L56
Course code: CSE3020
Instructions: Other than IRIS dataset, use a dataset to perform clustering and visualise the same
I have used the crabs dataset from the MASS package.
#1. Load the crabs dataset and inspect its structure.
# NOTE(review): rm(list = ls()) clears only workspace objects, not loaded
# packages or options; restarting the R session gives a truly clean state.
rm(list=ls())
library(MASS)
data("crabs") # load crabs Dataset
str(crabs) #view structure of dataset
## 'data.frame': 200 obs. of 8 variables:
## $ sp : Factor w/ 2 levels "B","O": 1 1 1 1 1 1 1 1 1 1 ...
## $ sex : Factor w/ 2 levels "F","M": 2 2 2 2 2 2 2 2 2 2 ...
## $ index: int 1 2 3 4 5 6 7 8 9 10 ...
## $ FL : num 8.1 8.8 9.2 9.6 9.8 10.8 11.1 11.6 11.8 11.8 ...
## $ RW : num 6.7 7.7 7.8 7.9 8 9 9.9 9.1 9.6 10.5 ...
## $ CL : num 16.1 18.1 19 20.1 20.3 23 23.8 24.5 24.2 25.2 ...
## $ CW : num 19 20.8 22.4 23.1 23 26.5 27.1 28.4 27.8 29.3 ...
## $ BD : num 7 7.4 7.7 8.2 8.2 9.8 9.8 10.4 9.7 10.3 ...
#2. Display the Statistical Summary of the dataset
# (two species x two sexes, 100 crabs each; five numeric measurements)
summary(crabs) #view statistical summary of dataset
## sp sex index FL RW CL
## B:100 F:100 Min. : 1.0 Min. : 7.20 Min. : 6.50 Min. :14.70
## O:100 M:100 1st Qu.:13.0 1st Qu.:12.90 1st Qu.:11.00 1st Qu.:27.27
## Median :25.5 Median :15.55 Median :12.80 Median :32.10
## Mean :25.5 Mean :15.58 Mean :12.74 Mean :32.11
## 3rd Qu.:38.0 3rd Qu.:18.05 3rd Qu.:14.30 3rd Qu.:37.23
## Max. :50.0 Max. :23.10 Max. :20.20 Max. :47.60
## CW BD
## Min. :17.10 Min. : 6.10
## 1st Qu.:31.50 1st Qu.:11.40
## Median :36.80 Median :13.90
## Mean :36.41 Mean :14.03
## 3rd Qu.:42.00 3rd Qu.:16.60
## Max. :54.60 Max. :21.60
#3. Preview the first six observations.
head(crabs) #view top rows of dataset
## sp sex index FL RW CL CW BD
## 1 B M 1 8.1 6.7 16.1 19.0 7.0
## 2 B M 2 8.8 7.7 18.1 20.8 7.4
## 3 B M 3 9.2 7.8 19.0 22.4 7.7
## 4 B M 4 9.6 7.9 20.1 23.1 8.2
## 5 B M 5 9.8 8.0 20.3 23.0 8.2
## 6 B M 6 10.8 9.0 23.0 26.5 9.8
# Separate the numeric columns (index plus the five measurements,
# columns 3-8) from the species label used later for comparison.
crabs.new <- crabs[, 3:8]
crabs.class <- crabs$sp
head(crabs.new)
## index FL RW CL CW BD
## 1 1 8.1 6.7 16.1 19.0 7.0
## 2 2 8.8 7.7 18.1 20.8 7.4
## 3 3 9.2 7.8 19.0 22.4 7.7
## 4 4 9.6 7.9 20.1 23.1 8.2
## 5 5 9.8 8.0 20.3 23.0 8.2
## 6 6 10.8 9.0 23.0 26.5 9.8
head(crabs.class)
## [1] B B B B B B
## Levels: B O
#4. Normalisation
# Create a function to normalize the data before clustering
#
# Min-max scaling: linearly maps a numeric vector onto [0, 1].
#   x     : numeric vector to rescale.
#   na.rm : drop NAs when computing the range (default FALSE keeps the
#           original behaviour, where any NA makes the result NA).
# Returns a vector the same length as x. A constant vector yields NaN
# (0/0), exactly as the original implementation did.
normalize <- function(x, na.rm = FALSE){
  # range() computes min and max in one pass instead of the original's
  # three separate min()/max() scans
  rng <- range(x, na.rm = na.rm)
  (x - rng[1]) / (rng[2] - rng[1])
}
# Min-max normalise every column of crabs.new (the five measurements
# plus 'index') in a single pass; `crabs.new[]` keeps it a data frame.
crabs.new[] <- lapply(crabs.new, normalize)
head(crabs.new)
## index FL RW CL CW BD
## 1 0.00000000 0.05660377 0.01459854 0.04255319 0.05066667 0.05806452
## 2 0.02040816 0.10062893 0.08759124 0.10334347 0.09866667 0.08387097
## 3 0.04081633 0.12578616 0.09489051 0.13069909 0.14133333 0.10322581
## 4 0.06122449 0.15094340 0.10218978 0.16413374 0.16000000 0.13548387
## 5 0.08163265 0.16352201 0.10948905 0.17021277 0.15733333 0.13548387
## 6 0.10204082 0.22641509 0.18248175 0.25227964 0.25066667 0.23870968
#5. K-means clustering with k = 2 (ideally one cluster per species).
# Fix the RNG seed so the random initial centroids -- and therefore the
# cluster sizes/labels reported below -- are reproducible across runs.
set.seed(1717)
result <- kmeans(crabs.new, 2) # apply k-means algorithm with no. of centroids(k)=2
result$size # gives no. of records in each cluster
## [1] 99 101
result$centers # gives value of cluster center datapoint value(2 centers for k=2)
## index FL RW CL CW BD
## 1 0.7485055 0.7037037 0.6003834 0.7066409 0.6891313 0.6889541
## 2 0.2564154 0.3542562 0.3132182 0.3549610 0.3444224 0.3378473
result$cluster #gives cluster vector showing the cluster where each record falls
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
## 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1
## 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
## 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2
## 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
## 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140
## 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160
## 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2
## 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
## 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1
## 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
# Verify results of clustering
# 2x2 panel: left-hand plots colour by k-means cluster, right-hand plots
# by the true species, for two pairs of measurements.
par(mfrow = c(2, 2), mar = c(5, 4, 2, 2))
#9. Plot to see how FL and RW data points have been distributed in clusters
plot(crabs.new[c("FL", "RW")], col = result$cluster)
plot(crabs.new[c("FL", "RW")], col = crabs.class)
plot(crabs.new[c("CL", "CW")], col = result$cluster)
plot(crabs.new[c("CL", "CW")], col = crabs.class)
# Turn the integer cluster ids into a factor so ggplot maps them to a
# discrete colour scale.
result$cluster <- as.factor(result$cluster)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.0.2
ggplot(crabs.new, aes(FL, RW, color = result$cluster)) + geom_point()
plot(crabs.new[c("FL", "RW")], col = result$cluster)
# Full pairs plot of every column, coloured by cluster.
plot(crabs.new, col = result$cluster)
# Cross-tabulate cluster assignment against the true species label.
table(result$cluster,crabs.class) # Result of table shows that Cluster 1 corresponds to B, and Cluster 2 corresponds to O
## crabs.class
## B O
## 1 42 57
## 2 58 43
# NOTE(review): with counts 42/57 and 58/43 the clusters only weakly
# track species -- the comment above overstates the correspondence; the
# split is close to 50/50 within each cluster.
# Animated visualisation of the k-means iterations.
library(animation)
## Warning: package 'animation' was built under R version 4.0.2
km1<-kmeans.ani(crabs.new,2)
library(factoextra) # clustering algorithms & visualization
## Warning: package 'factoextra' was built under R version 4.0.2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# Cluster scatter plot projected onto the first two principal components.
fviz_cluster(result, data = crabs.new)
# Compare k-means solutions for k = 2..5. nstart = 25 runs each k from
# 25 random starts and keeps the best; fixing the seed makes those
# random starts -- and the plotted solutions -- reproducible.
set.seed(1717)
k2 <- kmeans(crabs.new, centers = 2, nstart = 25)
k3 <- kmeans(crabs.new, centers = 3, nstart = 25)
k4 <- kmeans(crabs.new, centers = 4, nstart = 25)
k5 <- kmeans(crabs.new, centers = 5, nstart = 25)
# One cluster panel per k, arranged below in a 2x2 grid.
p1 <- fviz_cluster(k2, geom = "point", data = crabs.new) + ggtitle("k = 2")
p2 <- fviz_cluster(k3, geom = "point", data = crabs.new) + ggtitle("k = 3")
p3 <- fviz_cluster(k4, geom = "point", data = crabs.new) + ggtitle("k = 4")
p4 <- fviz_cluster(k5, geom = "point", data = crabs.new) + ggtitle("k = 5")
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 4.0.2
grid.arrange(p1, p2, p3, p4, nrow = 2)
Instructions: Train a decision tree model and visualise the tree
#6. Decision tree: work on a fresh copy of the full crabs data.
data <- crabs
# Shuffle the rows before the sequential train/test split below -- the
# raw data is grouped by species/sex (the first rows are all sp = B,
# sex = M), so an unshuffled head/tail split would be badly biased.
# Fix the seed so the shuffle and every downstream result is reproducible.
set.seed(1717)
shuffle_index <- sample(nrow(data)) # random permutation of 1..nrow
data <- data[shuffle_index, ]
head(data)
## sp sex index FL RW CL CW BD
## 19 B M 19 13.3 11.1 27.8 32.3 11.3
## 163 O F 13 15.6 14.0 31.6 35.3 13.8
## 190 O F 40 20.1 17.2 39.8 44.1 18.6
## 78 B F 28 13.7 12.5 28.6 33.8 11.9
## 23 B M 23 15.0 10.9 31.4 36.4 13.2
## 12 B M 12 12.3 11.0 26.8 31.5 11.4
# Split a data frame into train/test partitions by row position.
#
#   d     : data frame to split (assumed already shuffled).
#   size  : fraction of rows assigned to the training set, 0 < size < 1.
#   train : if TRUE return the training rows, else the remaining rows.
#
# Returns the first floor(size * nrow(d)) rows when train = TRUE, and
# the complementary rows otherwise.
create_train_test <- function(d, size = 0.8, train = TRUE) {
  stopifnot(is.data.frame(d), size > 0, size < 1)
  # Explicit floor() instead of relying on 1:x silently truncating a
  # fractional endpoint; seq_len() is safe even when n_train is 0.
  n_train <- floor(size * nrow(d))
  train_rows <- seq_len(n_train)
  if (train) {
    d[train_rows, , drop = FALSE]
  } else {
    # setdiff avoids the -integer(0) edge case of negative indexing
    d[setdiff(seq_len(nrow(d)), train_rows), , drop = FALSE]
  }
}
# Dataset is split into train and test
# 80% of the (shuffled) rows go to training, the remaining 20% to test.
data_train <- create_train_test(data, 0.8, train = TRUE)
data_test <- create_train_test(data, 0.8, train = FALSE)
dim(data_train)
## [1] 160 8
dim(data_test)
## [1] 40 8
The test set has 40 rows and 8 columns.
# Check the class balance in each split: both should be close to 50/50
# B vs O, since the full dataset contains 100 crabs of each species.
prop.table(table(data_train$sp))
##
## B O
## 0.5 0.5
prop.table(table(data_test$sp))
##
## B O
## 0.5 0.5
We can see that the distributions of the B and O classes are almost identical in the train and test sets, so we can proceed without resampling the dataset.
# rpart fits CART-style trees; rpart.plot draws them.
library(rpart)
## Warning: package 'rpart' was built under R version 4.0.2
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.0.2
# Classification tree predicting species (sp) from all other columns of
# the training data. NOTE(review): 'index' and 'sex' are among the
# predictors here -- confirm that is intended.
fit <- rpart(sp~., data = data_train, method = 'class')
# extra = 106 selects the per-node annotation style (see ?rpart.plot).
rpart.plot(fit, extra = 106)
The decision tree is shown above. The model was generated and the output is as follows.
# Predict the species of the held-out test rows with the fitted tree.
predict_unseen <-predict(fit, data_test, type = 'class')
# Confusion matrix: actual species (rows) vs predicted species (columns).
table_mat <- table(data_test$sp, predict_unseen)
table_mat
## predict_unseen
## B O
## B 18 2
## O 4 16
We see that the diagonal cells (top-left and bottom-right) show the correct predictions (true positives and true negatives).
# Accuracy = correct predictions (the diagonal of the confusion matrix)
# divided by the total number of test observations.
correct_counts <- diag(table_mat)
accuracy_Test <- sum(correct_counts) / sum(table_mat)
print(paste('Accuracy for test', accuracy_Test))
## [1] "Accuracy for test 0.85"
Thus, our model classifies most of the test set accurately.